/*	$OpenBSD: memset.S,v 1.1.1.1 2006/10/10 22:07:10 miod Exp $	*/
/*	$NetBSD: memset.S,v 1.1 2005/12/20 19:28:50 christos Exp $	*/

/*-
 * Copyright (c) 2002 SHIMIZU Ryo.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * Register aliases used throughout this file.
 *
 *   REG_PTR   r0  end pointer (dst + len) in the large fill; it has to
 *                 be r0 because it is tested with "tst #imm", which
 *                 only operates on r0
 *   REG_TMP1  r1  scratch
 *   REG_DST   r4  destination pointer (same register in both builds)
 *   REG_C         fill value: r2 for bzero (loaded with 0 by the code),
 *                 r5 for memset
 *   REG_LEN       byte count: r5 for bzero, r6 for memset
 *   REG_DST0  r3  memset only: copy of the original dst, returned in r0
 */
#define	REG_PTR		r0
#define	REG_TMP1	r1
#define	REG_DST		r4

#ifdef BZERO
# define	REG_C		r2
# define	REG_LEN		r5
#else
# define	REG_DST0	r3
# define	REG_C		r5
# define	REG_LEN		r6
#endif

/*
 * memset(dst, c, len), or bzero(dst, len) when BZERO is defined.
 *
 * Dispatches on len (unsigned compares):
 *   len >= 28  -> large  (longword fill)
 *   len >= 12  -> small  (computed jump into a table of stores)
 *   otherwise  -> the byte loop below, filling from the end towards dst
 *
 * memset returns the original dst in r0 (kept in REG_DST0); the bzero
 * build sets no return value.
 */
#ifdef BZERO
ENTRY(bzero)
#else
ENTRY(memset)
	mov	REG_DST,REG_DST0	/* for return value */
#endif
	/* small amount to fill ? */
	mov	#28,REG_TMP1
	cmp/hs	REG_TMP1,REG_LEN	/* if (len >= 28) goto large; */
	bt/s	large
	mov	#12,REG_TMP1		/* if (len >= 12) goto small; */
	cmp/hs	REG_TMP1,REG_LEN
	bt/s	small
	/*
	 * Delay slot of the bt/s above.  For bzero it is the mov that
	 * zeroes REG_C.  For memset it is the tst that follows; that is
	 * harmless on the taken path because small recomputes T before
	 * using it.
	 */
#ifdef BZERO
	mov	#0,REG_C
#endif
	/* very little fill (0 ~ 11 bytes) */
	tst	REG_LEN,REG_LEN		/* T = (len == 0) */
	add	REG_DST,REG_LEN		/* REG_LEN = dst + len (end pointer) */
	bt/s	done			/* nothing to do when len == 0 */
	add	#1,REG_DST		/* delay slot: REG_DST = dst + 1 */

	/*
	 * unroll 4 loops
	 *
	 * Bytes are stored from the end backwards with a pre-decrement of
	 * REG_LEN.  T is computed one step ahead: cmp/eq sets it when
	 * REG_LEN == dst + 1, so the next store writes dst[0] and the
	 * branch after that store leaves the loop.  Every cmp/eq after
	 * the first one sits in a branch delay slot, so it runs whether
	 * or not that branch is taken.
	 */
	cmp/eq	REG_DST,REG_LEN
1:	mov.b	REG_C,@-REG_LEN
	bt/s	done
	cmp/eq	REG_DST,REG_LEN
	mov.b	REG_C,@-REG_LEN
	bt/s	done
	cmp/eq	REG_DST,REG_LEN
	mov.b	REG_C,@-REG_LEN
	bt/s	done
	cmp/eq	REG_DST,REG_LEN
	mov.b	REG_C,@-REG_LEN
	bf/s	1b			/* not finished: go round again */
	cmp/eq	REG_DST,REG_LEN
done:
#ifdef BZERO
	rts
	nop
#else
	rts
	mov	REG_DST0,r0		/* delay slot: return original dst */
#endif


/*
 * small: reached from the entry code when 12 <= len < 28.
 *
 * If bit 0 of dst is clear, control goes to small_aligned.  Otherwise
 * the fill is done with byte stores by jumping into the middle of the
 * table below.  The table stores to descending offsets (31 down to 0
 * from dst) and each store is one 2-byte instruction, so starting
 * 2 * len bytes before local label 1 executes exactly len stores, which
 * cover dst[len - 1] down to dst[0].
 *
 * The displacement form of mov.b takes its data from r0 and allows
 * displacements 0..15 only, which is why the fill value is moved into
 * r0 and why REG_TMP1 = dst + 16 is used as the base for the upper half
 * of the table.
 */
small:
	mov	REG_DST,r0
	tst	#1,r0			/* T = (bit 0 of dst is clear) */
	bt/s	small_aligned
	mov	REG_DST,REG_TMP1	/* delay slot: runs on both paths */
	shll	REG_LEN			/* len * 2 = bytes of code to run */
	mova	1f,r0			/* 1f must be 4bytes aligned! */
	add	#16,REG_TMP1		/* REG_TMP1 = dst+16; */
	sub	REG_LEN,r0		/* r0 = 1f - 2 * len */
	jmp	@r0
	mov	REG_C,r0		/* delay slot: r0 = fill value; relies on
					   jmp having taken its target already */

	/*
	 * Store table.  The .align together with the even number of
	 * instructions up to local label 1 (32 in either build) keeps that
	 * label 4-byte aligned, as mova requires.
	 */
	.align	2
	mov.b	r0,@(15,REG_TMP1)
	mov.b	r0,@(14,REG_TMP1)
	mov.b	r0,@(13,REG_TMP1)
	mov.b	r0,@(12,REG_TMP1)
	mov.b	r0,@(11,REG_TMP1)
	mov.b	r0,@(10,REG_TMP1)
	mov.b	r0,@(9,REG_TMP1)
	mov.b	r0,@(8,REG_TMP1)
	mov.b	r0,@(7,REG_TMP1)
	mov.b	r0,@(6,REG_TMP1)
	mov.b	r0,@(5,REG_TMP1)
	mov.b	r0,@(4,REG_TMP1)
	mov.b	r0,@(3,REG_TMP1)
	mov.b	r0,@(2,REG_TMP1)
	mov.b	r0,@(1,REG_TMP1)
	mov.b	r0,@REG_TMP1
	mov.b	r0,@(15,REG_DST)
	mov.b	r0,@(14,REG_DST)
	mov.b	r0,@(13,REG_DST)
	mov.b	r0,@(12,REG_DST)
	mov.b	r0,@(11,REG_DST)
	mov.b	r0,@(10,REG_DST)
	mov.b	r0,@(9,REG_DST)
	mov.b	r0,@(8,REG_DST)
	mov.b	r0,@(7,REG_DST)
	mov.b	r0,@(6,REG_DST)
	mov.b	r0,@(5,REG_DST)
	mov.b	r0,@(4,REG_DST)
	mov.b	r0,@(3,REG_DST)
	mov.b	r0,@(2,REG_DST)
	mov.b	r0,@(1,REG_DST)
	/*
	 * bzero has no return value to load, so the final store is placed
	 * in the delay slot of rts and the rts itself takes one table
	 * slot.  memset keeps the final store in the table and uses the
	 * delay slot to load the return value.
	 */
#ifdef BZERO
	rts
1:	mov.b	r0,@REG_DST
#else
	mov.b	r0,@REG_DST
1:	rts
	mov	REG_DST0,r0
#endif


/*
 * 2 bytes aligned small fill
 *
 * Reached from small when bit 0 of dst is clear.  An odd len is made
 * even by storing the final byte separately; the rest is filled with
 * 16-bit stores by jumping into the table below.  Each mov.w is a
 * 2-byte instruction that writes 2 bytes, so starting len bytes of code
 * before local label 1 fills exactly len bytes.
 */
small_aligned:
#ifndef BZERO
	/* replicate the fill byte into the low 16 bits of REG_C */
	extu.b	REG_C,REG_TMP1		/* REG_C = ??????xx, REG_TMP1 = ????00xx */
	shll8	REG_C			/* REG_C = ????xx00, REG_TMP1 = ????00xx */
	or	REG_TMP1,REG_C		/* REG_C = ????xxxx */
#endif

	mov	REG_LEN,r0
	tst	#1,r0			/* len is aligned? */
	bt/s	1f
	add	#-1,r0			/* delay slot: r0 = len - 1 */
	mov.b	REG_C,@(r0,REG_DST)	/* fill the last byte, dst[len - 1] */
	mov	r0,REG_LEN		/* len is even from here on */
1:

	mova	1f,r0			/* 1f must be 4bytes aligned! */
	sub	REG_LEN,r0		/* r0 = 1f - len */
	jmp	@r0
	mov	REG_C,r0		/* delay slot: r0 = fill value; relies on
					   jmp having taken its target already */

	/*
	 * Store table, descending offsets 30..0.  The displacement form
	 * of mov.w takes its data from r0.  16 instructions in either
	 * build keep local label 1 4-byte aligned.
	 */
	.align	2
	mov.w	r0,@(30,REG_DST)
	mov.w	r0,@(28,REG_DST)
	mov.w	r0,@(26,REG_DST)
	mov.w	r0,@(24,REG_DST)
	mov.w	r0,@(22,REG_DST)
	mov.w	r0,@(20,REG_DST)
	mov.w	r0,@(18,REG_DST)
	mov.w	r0,@(16,REG_DST)
	mov.w	r0,@(14,REG_DST)
	mov.w	r0,@(12,REG_DST)
	mov.w	r0,@(10,REG_DST)
	mov.w	r0,@(8,REG_DST)
	mov.w	r0,@(6,REG_DST)
	mov.w	r0,@(4,REG_DST)
	mov.w	r0,@(2,REG_DST)
	/*
	 * bzero puts the final store in the delay slot of rts; memset
	 * uses that delay slot to load the return value instead.
	 */
#ifdef BZERO
	rts
1:	mov.w	r0,@REG_DST
#else
	mov.w	r0,@REG_DST
1:	rts
	mov	REG_DST0,r0
#endif



	.align	2
/*
 * large: reached from the entry code when len >= 28.
 *
 * The fill value is replicated into all four bytes of REG_C and
 * REG_PTR is set to dst + len.  If dst or len is not a multiple of 4,
 * unaligned_dst / unaligned_len store the odd head and tail bytes,
 * recompute len and branch back to aligned.  The aligned region is then
 * filled from the end towards dst: first in 32-byte chunks, then in
 * 4-byte steps until REG_PTR meets REG_DST.
 */
large:
#ifdef BZERO
	mov	#0,REG_C
#else
	extu.b	REG_C,REG_TMP1		/* REG_C = ??????xx, REG_TMP1 = ????00xx */
	shll8	REG_C			/* REG_C = ????xx00, REG_TMP1 = ????00xx */
	or	REG_C,REG_TMP1		/* REG_C = ????xx00, REG_TMP1 = ????xxxx */
	swap.w	REG_TMP1,REG_C		/* REG_C = xxxx????, REG_TMP1 = ????xxxx */
	xtrct	REG_TMP1,REG_C		/* REG_C = xxxxxxxx */
#endif

	mov	#3,REG_TMP1
	tst	REG_TMP1,REG_DST	/* T = ((dst & 3) == 0) */
	mov	REG_DST,REG_PTR
	bf/s	unaligned_dst
	add	REG_LEN,REG_PTR		/* REG_PTR = dst + len; */
	tst	REG_TMP1,REG_LEN	/* T = ((len & 3) == 0) */
	bf/s	unaligned_len
	/*
	 * The delay slot of the bf/s above is the first instruction of
	 * aligned.  Loading 32 into REG_TMP1 is harmless on the taken
	 * path: unaligned_len does not read REG_TMP1 and comes back
	 * through aligned, which loads it again.
	 */

aligned:
	/* fill 32*n bytes */
	mov	#32,REG_TMP1
	cmp/hi	REG_LEN,REG_TMP1	/* T = (32 > len), unsigned */
	bt	9f
	.align	2
1:	sub	REG_TMP1,REG_PTR	/* ptr -= 32 */
	mov.l	REG_C,@REG_PTR
	sub	REG_TMP1,REG_LEN	/* len -= 32 */
	mov.l	REG_C,@(4,REG_PTR)
	cmp/hi	REG_LEN,REG_TMP1	/* T = (32 > len): last full chunk? */
	mov.l	REG_C,@(8,REG_PTR)
	mov.l	REG_C,@(12,REG_PTR)
	mov.l	REG_C,@(16,REG_PTR)
	mov.l	REG_C,@(20,REG_PTR)
	mov.l	REG_C,@(24,REG_PTR)
	bf/s	1b
	mov.l	REG_C,@(28,REG_PTR)	/* delay slot: 8th longword */
9:

	/*
	 * fill left 4*n bytes
	 *
	 * Same one-step-ahead scheme as the byte loop at the entry:
	 * REG_DST is advanced by 4 so that cmp/eq sets T when the next
	 * pre-decrement store will write the first longword at dst.
	 */
	cmp/eq	REG_DST,REG_PTR
	bt	9f			/* nothing left */
	add	#4,REG_DST
	cmp/eq	REG_DST,REG_PTR
1:	mov.l	REG_C,@-REG_PTR
	bt/s	9f
	cmp/eq	REG_DST,REG_PTR
	mov.l	REG_C,@-REG_PTR
	bt/s	9f
	cmp/eq	REG_DST,REG_PTR
	mov.l	REG_C,@-REG_PTR
	bt/s	9f
	cmp/eq	REG_DST,REG_PTR
	mov.l	REG_C,@-REG_PTR
	bf/s	1b
	cmp/eq	REG_DST,REG_PTR
9:
#ifdef BZERO
	rts
	nop
#else
	rts
	mov	REG_DST0,r0		/* delay slot: return original dst */
#endif


/*
 * unaligned_dst / unaligned_len: alignment fix-up for the large fill.
 *
 * unaligned_dst is entered when (dst & 3) != 0.  It stores up to one
 * byte and one halfword at the head so that REG_DST becomes 4-byte
 * aligned, then checks the end pointer.
 *
 * unaligned_len is entered directly from large when dst is aligned but
 * (len & 3) != 0, and by fall-through from the code above when the end
 * pointer REG_PTR = dst + len is not 4-byte aligned.  It stores up to
 * one byte and one halfword at the tail, moving REG_PTR down to a
 * 4-byte boundary.
 *
 * Finally len is recomputed as REG_PTR - REG_DST and control returns to
 * aligned.  REG_PTR is tested with "tst #imm", which only operates on
 * r0.
 */
unaligned_dst:
	mov	#1,REG_TMP1
	tst	REG_TMP1,REG_DST	/* if (dst & 1) {               */
	add	#1,REG_TMP1		/* REG_TMP1 = 2; add leaves T alone */
	bt/s	2f
	tst	REG_TMP1,REG_DST	/* delay slot: T = !(dst & 2)   */
	mov.b	REG_C,@REG_DST		/*   *dst++ = c;                */
	add	#1,REG_DST
	tst	REG_TMP1,REG_DST	/* dst changed: test bit 1 again */
2:					/* }                            */
					/* if (dst & 2) {               */
	bt	4f
	mov.w	REG_C,@REG_DST		/*   *(u_int16_t *)dst = c;     */
	add	#2,REG_DST		/*   dst += 2;                  */
4:					/* }                            */


	tst	#3,REG_PTR		/* if (ptr & 3) {               */
	bt/s	4f			/*                              */
	/*
	 * The delay slot of the bt/s above is the tst at unaligned_len.
	 * It overwrites T, but the branch decision has already been made
	 * and the code at 4 does not read T.
	 */
unaligned_len:
	tst	#1,REG_PTR		/*   if (ptr & 1) {             */
	bt/s	2f
	/*
	 * Delay slot: bit 1 of ptr is tested before the byte store below.
	 * The result is still valid afterwards because decrementing an
	 * odd ptr by one only clears bit 0.
	 */
	tst	#2,REG_PTR
	mov.b	REG_C,@-REG_PTR		/*     *--ptr = c;              */
2:					/*   }                          */
					/*   if (ptr & 2) {             */
	bt	4f
	mov.w	REG_C,@-REG_PTR		/*     *--(u_int16_t*)ptr = c;  */
4:					/*   }                          */
					/* }                            */

	mov	REG_PTR,REG_LEN
	bra	aligned
	sub	REG_DST,REG_LEN		/* delay slot: len = ptr - dst  */

